* Set up log
cd $ohie
cap log close
global sysdate: disp %tdYYNNDD  date("`c(current_date)'", "DMY")
qui log using 	"./logs/extrapolation_slate_$sysdate.log", replace

* Set up timer
timer clear 1
timer on 1 

/*----------------------------------------------------------------------*/
/* PROGRAM: extrapolation_slate.do					*/
/*									*/
/* PURPOSE:								*/
/* [*]	This code uses the linear MTO(p), MUO(p), MTE(p) estimated using*/
/*	the common covariates in the OHIE 1 lottery entrant sample for	*/
/*	estimating the SLATE in the following  scenarios:		*/
/*	- Using Oregon Xs and Oregon ps 				*/
/*	- Using Massachusetts Xs and Massachusetts ps			*/
/*	The code only works for the linear MTE with covariates, where	*/
/*	the covariates are the common covariates (age, gender, English,	*/
/*	and all two-way interactions between those). Using Xs for	*/
/*	a particular state means we use the individual level data from	*/
/*	that data set, and using the ps	from a particular data set means*/
/*	that we use the propensity score coefficients we obtain from	*/
/*	that data. This code outputs data that is used to  plot		*/
/*	the E[MTE(p,X_MA)] line in Figure "extrapolation_or_ma_Y_num" 	*/
/*	of the paper.							*/
/*									*/
/* OUTPUT:								*/
/* [*]	extrapolation_slate: This data set contains data 	*/
/*	for above-mentioned figure.					*/
/*									*/
/*----------------------------------------------------------------------*/

* Set up display options
clear
set type double
set more off, permanently


*************************************************
* OTHER SETUP			*
*************************************************

* Set up seeds
global Y_num_seed = 	6574358
	
*********************************************
* VARIABLES AND CONTROLS	  	*
*********************************************	

* Outcome variables
//local Ys "Y_any Y_charges Y_num"
local Ys "Y_num"

* Number of bootstrap replications
* The first replication of the bootstrap process has no re-sampling; in other
* words the first replication always runs on the full original non-bootstrapped
* sample. This applies to both the OHIE data and the BRFSS data.
local reps = 1001

* Endogenous variables
* The endogenous variables in OHIE and BRFSS can be named differently.
* OHIE
local D_ohie	"any_medicaid"
* BRFSS
local D_brfss	"D"

* Instrument
* The instrument variable must have the same name in both OHIE and BRFSS.
local Z		"Z"

* Define the common covariates in OHIE and BRFSS.
* The common covariates include binary variables for age, gender, and English,
* as well as all two-way interactions between these covariates. These variables
* must have the same name in both the OHIE and the BRFSS data.
* Even though we specify the "Zs" here (i.e., the interaction terms of the
* covariates with Z), the code also creates a separate set of interactions of Z
* with the covariates in order to compute each individual's pB and pI. 
* Therefore,  it is essential to look through the code carefully in case any
* changes are made to the Xs and Zs included in the lists below.
local Xs	"_X*"
local Zs	"_Z*"

* Frequency weight variable
* This variable is essential for extrapolation to the BRFSS data. This variable
* also needs to have the same name in both the OHIE and BRFSS data.
local wt	"w"

* Output Excel file 
local out_file "extrapolation_slate"	

* Define input file for OHIE
local infile_ohie "$final/oregonnumhh1.dta"

* Define input file for BRFSS	
local infile_brfss "$final/brfss.dta"

* Assign names to the data sets being used
local data "ohie brfss"

* Indicate the samples for extrapolation
* We use the following four approaches to extrapolation:
* -	Oregon Xs and Oregon ps (we use the OHIE individual level data and the
*	propensity score coefficients estimated in OHIE)
* -	Massachusetts Xs and Massachusetts ps (we use the BRFSS individual level
*	data and the propensity score coefficients estimated in BRFSS)
local extrap_samples "or_xs_or_ps ma_xs_ma_ps"

*********************************************
* ESTIMATE AND SAVE THE PROPENSITY SCORE COEFFICIENTS IN OHIE AND BRFSS  *
*********************************************	
* In both the OHIE and BRFSS data, estimate the propensity scores using the
* appropriate endogenous variable as the dependent variable, and using the
* common covariates, Z, and the interactions between Z and the covariates as
* the controls in a linear regression model. We also use the frequency weights
* here. We save the coefficients so that we can use them later on.
* We save the propensity score coefficients for every
* bootstrap sample, rather than just the propensity scores for just the 
* full sample. We will eventually use all coefficients appropriately for each
* bootstrap sample.

* Begin looping through data
foreach d of local data {
	
	di "BEGINNING SAVING PROPENSITY SCORE COEFFICIENTS FOR `d'"

	* Begin looping through outcomes
	* In the OHIE 1 lottery entrant sample we drop individuals who have 
	* missing values for the outcome.
	foreach Y of local Ys {
		
		* Initialize a matrix for storing the propensity score 
		* coefficients
		matrix pscores_`d'_`Y' = J(`reps', 300, .)
				
		* Initialize the seed
		set seed $`Y'_seed
		
		* Initialize a list for storing the names of variables for 
		* renaming later
		local renamevars ""
	
		* Begin bootstrapping loop
		forval rep = 1/`reps' {			
			
			* Set up
			use "`infile_`d''", clear
			
			* Drop individuals with missing values
			* (if the OHIE 1 lottery entrant sample)
			if "`d'"=="ohie" {
				keep if `Y' != . 
			}

			* Re-sample for bootstrapping
			quietly if `rep' > 1 {
				bsample
			}


			* Create interaction terms of every covariate with Z
			* We do this because later on we need to re-estimate
			* the propensity scores while assuming that the Z value
			* for each individual is the opposite of what we actually
			* observe. Re-creating the interaction terms of the
			* covariates with Z helps in coding that part.
				
			foreach x of varlist `Xs' {						
				gen _int_`x' = `Z'*`x'									
			}				
				
			* Estimate propensity scores using Z, Xs, and the
			* interactions between Z and the Xs as the covariates
			* and D as the dependent variable. We use the
			* appropriate endogenous variable D for each sample
			* (OHIE or BRFSS).
			* We use a linear regression to predict the propensity
			* scores.  Please note that we do not censor the
			* propensity scores here. We keep the uncensored
			* propensity scores for the MTE estimation, however,
			* we do censor them for estimating any treatment effects
			* from an MTE with covariates. Propensity scores are 
			* also not censored when estimating the linear MTE
			* without covariates.	
			
			quietly reg `D_`d'' `Z' `Xs' _int* [pweight=`wt']
				
			* Save the propensity score coefficients to the output
			* matrix
			
			local matrixiter = 1
			
			quietly foreach x of varlist `Xs' _int* `Z' {					
				
				* Save in the matrix
				matrix pscores_`d'_`Y'[`rep', `matrixiter'] = _b[`x']
				local ++matrixiter
				
				* Assign a name for renaming
				if `rep' ==1 {
					local renamevars "`renamevars' _p_`x'"
				}
			}
				
			* Save the constant to the output matrix
			
			* Save in the matrix
			matrix pscores_`d'_`Y'[`rep', `matrixiter'] = _b[_cons]
			local ++matrixiter
			
			* Assign a name for renaming
			if `rep' ==1 {
				local renamevars "`renamevars' _p__cons"
			}
				
		} // end bootstrap loop
		
		* Save output matrix
		svmat double pscores_`d'_`Y'
				
		* Delete extra variables and observations from the matrix
		keep pscores_`d'_`Y'*
		
		gen obs_num = _n
		drop if obs_num>`reps'
		drop obs_num
					
		* Rename variables in the output matrix and clean up
		local i=1		
		foreach var of local renamevars {			
			rename pscores_`d'_`Y'`i' `var'
			local ++i
		}
		
		drop pscores_`d'_`Y'*
			
		* Save the dataset with the propensity score coefficients
		save "$final/pscores_`d'_`Y'.dta", replace
					
	} // end looping through outcomes
	
	di "ENDING SAVING PROPENSITY SCORE COEFFICIENTS FOR `d'"
	
} // end looping through data


*********************************************
* ESTIMATE THE SLATE IN THE OHIE AND THE BRFSS SAMPLES  *
*********************************************	

* Begin looping through outcomes
foreach Y of local Ys {
	
	* Initialize a matrix for storing the results
	matrix slates_`Y' = J(`reps', 200, .)
	
	* Initialize a matrix iterator for outputting to the matrix
	local matrixiter = 1
		
	* Initialize a list for renaming variables later on
	local renamevars ""
		
	* Begin looping through extrapolation samples
	foreach ext of local extrap_samples {
		
		* Initialize the seed for the specific outcome
		set seed $`Y'_seed
			
		* Begin bootstrapping loop
		forval rep = 1/`reps' {	
			
		*********************************************
		* RETRIEVE STORED INFORMATION  	*
		*********************************************
		* In this section we retrieve the linear MTO(p), MUO(p), MTE(p)
		* and the beta coefficients estimated in the OHIE 1 lottery
		* entrant sample using the common covariates. Depending on the
		* extrapolation sample, we also retrieve the appropriate
		* propensity score coefficients we estimated and saved above,
		* either from OHIE or BRFSS.
		
			* Retrieve the linear MTO(p), MUO(p), MTE(p) and beta
			* coefficients estimated using the common covariates
			* in the OHIE 1 lottery sample, specifically the ones
			* saved from the global_polynomial_all_specs.do file.
			
			* Load the saved matrix from 
			* global_polynomial_all_specs.do file
			use "$final/mtedata_common_`Y'_FN1", clear
				
			* Save all the data extracted from OHIE as a matrix
			* We technically do not need to retrieve the MTO(p) and
			* MUO(p) for the SLATE estimation, but we need them in
			* order to use the treatment_effects.ado file.
			mkmat MTO_* MUO_* mte_* _b*, matrix(ohie_data)
				
			* Save each beta coefficient for the appropriate
			* bootstrap sample as a scalar
			local betas "_b*"
				
			foreach x of varlist `betas' {	
			
				* Get column number of the beta coefficient
				local `x'_col = colnumb(ohie_data, "`x'")
				
				* Save as scalar
				scalar define sc_`x' = ohie_data[`rep', ``x'_col']
			}				
			
			* Store the MTO(p), MUO(p), and MTE(p) functions
			* in matrices
		
			* Create blank matrices for each function
			matrix mto_matrix = J(1,2,.)
			matrix muo_matrix = J(1,2,.)
			matrix mte_matrix = J(1,2,.)
			
			local ests "MTO_term_1 MTO_term_2 MUO_term_1 MUO_term_2 mte_term_1 mte_term_2"
			
			* Get column numbers for each function's component
			foreach est of local ests {
				local `est'_col = colnumb(ohie_data, "`est'")
			}
			
			* Save the final matrices for the appropriate
			* bootstrap sample
			
			* mto(p) matrix
			matrix mto_matrix[1,1] = ohie_data[`rep', `MTO_term_1_col']
			matrix mto_matrix[1,2] = ohie_data[`rep', `MTO_term_2_col']
				
			* muo(p) matrix
			matrix muo_matrix[1,1] = ohie_data[`rep', `MUO_term_1_col']
			matrix muo_matrix[1,2] = ohie_data[`rep', `MUO_term_2_col']
			
			* mte(p) matrix
			matrix mte_matrix[1,1] = ohie_data[`rep', `mte_term_1_col']
			matrix mte_matrix[1,2] = ohie_data[`rep', `mte_term_2_col']
											
			* Store the propensity score coefficients in scalars
			
			* Load the propensity score coefficients we saved above.
			* Depending on which extrapolation we are doing, we 
			* load different propensity score coefficients
			
			* If we are doing an extrapolation using the Oregon ps,
			* load the OHIE propensity score coefficients
			if "`ext'"== "or_xs_or_ps" {
				use "$final/pscores_ohie_`Y'.dta", clear
			} 
			* If we are doing an extrapolation using the Massachusetts
			* ps, load the BRFSS propensity score coefficients
			else {
				use "$final/pscores_brfss_`Y'.dta", clear
			}
				
			* Save the propensity scores as a matrix
			mkmat _p*, matrix(pscores)
			
			* Save each propensity score coefficient for the
			* appropriate bootstrap sample as a scalar
			local pscores "_p*"				
			
			foreach p of varlist `pscores'{
			
				* Get column number
				local `p'_col = colnumb(pscores, "`p'")
				
				* Save as scalar
				scalar define sc_`p' = pscores[`rep', ``p'_col']
			}
					
		*********************************************
		* CALCULATE MU AND PROPENSITY SCORES FOR EACH INDIVIDUAL *
		*********************************************
		
			* Set up
			* Depending on what extrapolation sample we are using,
			* use different input data sets
		
			* If we are using Oregon Xs, load the OHIE individual
			* level data
			if "`ext'"== "or_xs_or_ps" {
				use "`infile_ohie'", clear
			} 
			* If we are using Massachusetts Xs, load the BRFSS
			* individual level data
			else {
				use "`infile_brfss'", clear
			}
			
			* Create interaction terms of every covariate with Z
			* We need to do this in order to ensure the variables
			* for the Z-interaction terms are appropriately named				
			foreach x of varlist `Xs' {					
				gen _int_`x' = `Z'*`x'										
			}
			
			* Drop observations where the outcome is missing before
			* re-sampling for the bootstrapping (if using the OHIE 
			* individual level data)
			* The reason to drop observations with a missing outcome
			* before re-sampling is twofold:
			* (a) By dropping missing outcomes, we cannot oversample 
			* individuals with missing values when bootstrapping
			* (b) By dropping missing outcomes, we ensure that 
			* every bootstrap sample is of equal size
			if "`ext'"== "or_xs_or_ps" {
				keep if `Y' != . 
			}

			* Re-sample 
			quietly if `rep' > 1 {
				bsample
				local --matrixiter
				local --matrixiter
				local --matrixiter
			}
							
			* Calculate mu = (beta_T - beta_U)*X, mu_T = beta_T*X
			* and mu_U = beta_U*X
			quietly gen double mu_T = 0
			quietly gen double mu_U = 0
			quietly gen double mu = 0
			quietly foreach x of varlist `Xs' {
				replace mu_T= mu_T + sc__b_1_`x'*`x'
				replace mu_U = mu_U + sc__b_0_`x'*`x'
				replace mu = mu + (sc__b_1_`x'-sc__b_0_`x')*`x'
			}							
			
			* Calculate average mu
			su mu
			
			* Save mu
			matrix slates_`Y'[`rep', `matrixiter++'] = `r(mean)'
			if `rep' ==1 {
				local renamevars "`renamevars' SMTE_mu_`ext'"
			}
			
			
			
				
			* Calculate the propensity score for each individual
			* using their observed Z and the propensity score
			* coefficients we have already loaded
			quietly gen double p_Z = sc__p__cons
			di "`ext'"
			di sc__p__cons
			quietly foreach x of varlist `Xs' _int* `Z' {
				replace p_Z = p_Z + sc__p_`x'*`x'
			}
			
			di "Number of individuals with propensity scores >1"
			count if p_Z>1 & p_Z !=.
					
			di "Number of individuals with propensity scores <0"
			count if p_Z<0 & p_Z !=.
			
			* Drop individuals with missing propensity scores 
			drop if p_Z==.
				
			* Estimate the alternative propensity score, which we
			* define as having Z equal to the opposite of whatever
			* is observed for each individual
			
			* Generate an alternative Z that is the opposite of
			* whatever each individual has and predict the
			* propensity score with that
			gen Z_alt = ~`Z'
			gen old_Z = `Z'
			drop `Z'
			rename Z_alt `Z'
				
			* Generate alternative interactions of Z with 
			* the covariates
			foreach x of varlist `Xs' {							
				gen alt_`x' = `Z'*`x'
				gen old_int_`x'=_int_`x'
				drop _int_`x'
				rename alt_`x' _int_`x'
			}
										
			* Predict the alternative propensity score
			quietly gen double p_Z_alt = sc__p__cons
			quietly foreach x of varlist `Xs' _int* `Z' {
				replace p_Z_alt = p_Z_alt + sc__p_`x'*`x'
			}
			
			* Revert back to the old names
			drop `Z'
			rename old_Z `Z'
			
			foreach x of varlist `Xs' {				
				drop _int_`x'
				rename old_int_`x' _int_`x'				
			}
				
			* Generate pB and pI for each individual
			* (should be the same across subgroups)
					
			* First, determine each individual's pB
				
			* For randomized out individuals, the propensity score 
			* estimated in the above regression is their pB
			gen pB = p_Z if `Z'==0
			
			* For randomized in individuals, the propensity score
			* estimated in the above regression ASSUMING THEIR Z=0
			* is their pB
			replace pB = p_Z_alt if `Z'==1
				
			* Second, determine each individual's pI
			
			* For randomized out individuals, the propensity score
			* estimated in the above regression ASSUMING THEIR Z=1
			* is their pI
			gen pI = p_Z_alt if `Z'==0
			
			* For randomized in individuals, the propensity score
			* estimated in the above regression is their pI
			replace pI = p_Z if `Z'==1
				
			* Third, censor pB and pI 
			* If pB or pI is less than 0, censor them at 0. If pB 
			* or pI is greater than 1, censor them at 1.
				
			gen pB_cens = pB
			replace pB_cens = 0 if pB_cens<0
			replace pB_cens = 1 if pB_cens>1
			
			gen pI_cens = pI
			replace pI_cens = 0 if pI_cens<0
			replace pI_cens = 1 if pI_cens>1
				
			save "$final/`ext'.dta", replace	
								
		*********************************************
		* ESTIMATE THE LATE FOR EACH INDIVIDUAL*				
		*********************************************
		* We use the closed-form expression for calculating the LATE
		* assuming a linear MTE		
			
			* Generate the MTE(X,p) slope and intercept for
			* each individual
			gen MTE_x_p_intercept = mte_matrix[1,1] + mu
			gen MTE_x_p_slope = mte_matrix[1,2]
				
			* Generate the LATE for each individual using the
			* closed form expression
			gen LATE = (1/(pI_cens - pB_cens))*((MTE_x_p_intercept*pI_cens ///
				+ (MTE_x_p_slope/2)*(pI_cens^2)) - (MTE_x_p_intercept*pB_cens ///
				 + (MTE_x_p_slope/2)*(pB_cens^2)))
				
			* If pB is equal to pI, evaluate the MTE(x,p) 
			* at that point
			replace LATE = (MTE_x_p_intercept + MTE_x_p_slope*pI_cens) ///
				if float(pB_cens) == float(pI_cens)
	
		*********************************************
		* ESTIMATE THE SLATE*				
		*********************************************		
			
			* Calculate the probability of being a complier
			* for each individual
					
			* We use the closed form expression for the probability
			* of being a complier
			
			gen prob_IT = 0
			gen prob_BU = 0
			
			if "`ext'"== "or_xs_or_ps" {			
				replace prob_IT = 1 if `D_ohie'==1 & `Z'==1
				replace prob_BU = 1 if `D_ohie'==0 & `Z'==0
			}
			else {
				replace prob_IT = 1 if `D_brfss'==1 & `Z'==1
				replace prob_BU = 1 if `D_brfss'==0 & `Z'==0
			}
			
						
			gen prob_C = ((pI_cens - pB_cens)/pI_cens)*prob_IT + ///
				((pI_cens - pB_cens)/(1-pB_cens))*prob_BU
			replace prob_C=0 if prob_C==.
				
			* Calculate the SLATE
			* In order to calculate the SLATE, we multiply each
			* individual's LATE by his/her probability of being
			* a complier. We take the sum of this product and
			* divide it by the sum of the probabilities of being
			* a complier across the entire sample.
			gen prob_C_LATE = LATE*prob_C
			
			su prob_C_LATE [aweight=`wt'] if prob_C != 0 
			local sum_LATES = `r(sum)'
			
			su prob_C [aweight=`wt'] if prob_C != 0 
			local sum_probs = `r(sum)'
			
			local SLATE_`ext' = `sum_LATES'/`sum_probs'
			
			*Save the MTE Slope
			su MTE_x_p_slope
			matrix slates_`Y'[`rep', `matrixiter++'] = `r(mean)'
			if `rep' ==1 {
				local renamevars "`renamevars' SMTE_term_2_`ext'"
			}
			
			* Save the SLATE
			matrix slates_`Y'[`rep', `matrixiter++'] = `SLATE_`ext''
			if `rep' ==1 {
				local renamevars "`renamevars' SLATE_`ext'"
			}
			
				
		} // end bootstrap loop
		
	} // end data loop

	* Save matrix
	svmat double slates_`Y'
	
	save "$final/slates_`Y'.dta", replace

	* Delete extra variables and observations
	keep slates_`Y'*
	save "$final/slates_`Y'.dta", replace
	
	gen obs_num = _n
	drop if obs_num>`reps'
	drop obs_num
				
	* Rename variables
	local i=1		
	foreach var of local renamevars {			
		cap rename slates_`Y'`i' `var'
		local ++i
	}
	save "$final/slates_`Y'.dta", replace

	
	* Compute the difference in the SLATEs
	gen diff_SLATES = SLATE_or_xs_or_ps - SLATE_ma_xs_ma_ps
	save "$final/slates_`Y'.dta", replace

	
	* Output the bootstrap statistics
	bootstrap_statistics $output `out_file' `Y'
		
} // end outcomes loop

	
timer off 1
timer list 1
local hours = `r(t1)'/3600
di "Computing time is `hours' hours"

qui log close
